## Layout
Below is some pseudo-code describing my current plan for the simulations.
Question: do I need to save the datasets? If I set a seed we can re-create them, and I’m not sure why we would need them.
# Inputs:
# Scenarios: Scenarios defining datasets within each simulation
# nSim: The number of simulations within each scenario
# nChains: The number of chains used for CI
# nIter: The number of iterations within each chain
# nBayes: The number of chains run for the Bayesian method
# bayesIter: The number of iterations to run for each Bayesian model chain
# nFreq: The number of different seeds used for the Frequentist method
runSimulations(Scenarios, nSim, nChains, nIter, nBayes, bayesIter, nFreq){
# Loop over scenarios
for scn in Scenarios{
# The various parameters that define the current scenario
(N, P, K, delta_mu, sigma2, pi, P_n) = scn$Parameters
# Objects to hold model performance score
ciScore = array(0, dim = c(nIter, nChains, nSim))
bayesScore = matrix(0, nrow = nBayes, ncol = nSim)
freqScore = matrix(0, nrow = nFreq, ncol = nSim)
# Matrices to hold the upper and lower bounds on the number of clusters
# present based upon samples recorded
nClustLower = array(0, dim = c(nIter, nChains, nSim))
nClustUpper = array(0, dim = c(nIter, nChains, nSim))
bayesNClustLower = matrix(0, nrow = nBayes, ncol = nSim)
bayesNClustUpper = matrix(0, nrow = nBayes, ncol = nSim)
# Loop over the number of simulations
for l in 1:nSim{
# Generate a new dataset for the current simulation based upon the scenario
set.seed(l)
dataset, trueClusters = generateSimulationDataset(N, P, K, delta_mu, sigma2, pi, P_n)
# Create an array to hold the predicted clusterings
predClustering = array(0, dim = c(nIter, N, nChains))
bayesClustering = matrix(0, nrow = nBayes, ncol = N)
freqClustering = matrix(0, nrow = nFreq, ncol = N)
# Loop over the number of chains
for i in 1:nChains{
# set the seed to differentiate chains
set.seed(i)
# run nBayes Bayesian models
if(i <= nBayes){
bayesModel = bayesianMixtureModel(dataset, bayesIter)
ciModel = bayesModel[1:nIter, ]
# Create the PSM
posteriorSimilarityMatrix = makePSM(bayesModel$clusterings)
# Create a summary clustering
bayesClustering[i, ] = clustering(posteriorSimilarityMatrix)
# Uncertainty on the number of clusters present based upon MCMC samples
# (see Sara Wade and Zoubin Ghahramani, 2015).
bayesCredibleBall = credibleball(
bayesClustering[i, ],
bayesModel$clusterings
)
# Record the score of the current Bayesian model
bayesScore[i, l] = ARI(bayesClustering[i, ], trueClusters)
bayesPredK = numberClusters(bayesClustering[i, ])
# Record the the uncertainty on the number of clusters present
bayesNClustLower[i, l] = bayesCredibleBall$lower - bayesPredK
bayesNClustUpper[i, l] = bayesCredibleBall$upper - bayesPredK
} else {
ciModel = bayesianMixtureModel(dataset, nIter)
}
# Initialise the consensus matrix
consensusMatrix = matrix(0, nrow = N, ncol = N)
for j in 1:nIter{
currConsensusMatrix = makeConsensusMatrix(ciModel$clusterings[1:j,])
consensusMatrix = currConsensusMatrix * (1 / i) + consensusMatrix * ((i - 1) / i)
# Need some way of doing this - Paul doesn't agree with use of
# ``mcclust::maxpear``, I have to think about this
predClustering[j, , i] = clustering(consensusMatrix)
# calculate how well the model has performed in uncovering the true
# structure
ciScore[j, i, l] = ARI(predClustering[j, , i], trueClusters)
# The predicted number of clusters present
ciPredK = numberClusters(predClustering[j, , i])
# Not sure that this is appropriate either for the same reasons as
# ``mcclust::maxpear`` might not be; I have to look into this properly.
credibleBallClusters = credibleball(
predClustering[j, , i],
ciModel$clusterings[1:j, ]
)
# Record the lower and upper bounds on the number of clusters present
nClustLower[j, i, l] = credibleBallClusters$lower - ciPredK
nClustUpper[j, i, l] = credibleBallClusters$upper - ciPredK
}
# run nFreq frequentist models and record the predicted clustering
if(i <= nFreq){
freqModel = frequentistMixtureModel(dataset)
freqClustering[i, ] = freqModel$clustering
freqScore[i, l] = ARI(freqModel$clustering, trueClusters)
}
}
medCIScore = apply(ciScore, c(1, 2), median) # median over simulations (TODO: move outside the simulation loop)
}
# Make a 3D surface plot of the median score of the consensus inference
# results as a function of (nIter, nChains)
plot3D(ciScore)
# Make line plots of the median score bounded by the interquartile range over
# simulations for a subset of number of chains as a function of number of
# iterations and for a subset of number of iterations as a function of
# number of chains.
subsetChains = c(1, 10, 50, 100)
subsetIter = c(1, 100, 500, 5000)
nChainSubsets = length(subsetChains)
nIterSubsets = length(subsetIter)
facetLinePlot(ciScore, ~nChains:nIter, subsetChains, subsetIter)
# The scores for the subset of interest
ciSubsetScores = ciScore[subsetIter, subsetChains, ]
###Not sure about comparing yet -
###Also, add test (coda::gelman.diag()) for convergence in Bayesian case
# # Compare the median score of the models over simulations:
# for(i in 1:nChainSubsets){
# medCIScoreChain = ciSubsetScores[ , i, ] %>%
# apply(1, median)
# }
#
# for(j in 1:nIterSubsets){
# medCIScoreIter = ciSubsetScores[j, , ] %>%
# apply(1, median)
# }
#
# medBayesScore = bayesScore %>%
# apply(1, median)
#
# medFreqScore = freqScore %>%
# apply(1, median)
#
# # Compare median score over simulations
# boxplot(medCIScoreChain, medCIScoreIter, medBayesScore, medFreqScore)
}
}
My idea.
K <- 5
n <- 100
n_sim <- 10
n_seeds <- 50
n_iter <- 500
score_ci <- array(dim = c(n_sim, n_seeds, n_iter))
true_labels <- matrix(sample(1:K, size = n_sim * n, replace = T), nrow = n_sim)
for(l in 1:n_sim){
true_labels_l <-true_labels[l, ]
ci_out <- list()
ci_cm <- list()
pred_ci <- c()
pred_ci <- matrix(nrow = n_iter, ncol = n)
score_ci_l <- matrix(nrow = n_iter, ncol = n_seeds)
for(j in 1:n_iter){
.old_cm <- matrix(0, nrow=n, ncol=n)
ci_out_j <- matrix(0, nrow = n_seeds, ncol = n)
for(i in 1:n_seeds){
set.seed(i)
ci_out_j[i, ] <- ciSim(true_labels_l, j, K, truth_at = 1500)
# Create coclustering matrix (technically cltoSim creates a posterior similarity
# matrix from a clustering vector, but the effect in *this* case is the same)
.curr_cm_i <- mcclust::cltoSim(ci_out_j[i, ])
# .curr_cm_i <- makePSM(ci_out_j[i, ])
.curr_cm <- .curr_cm_i * (1/i) + .old_cm * ((i - 1) / i)
score_ci[l, i, j] <- .curr_cm %>%
mcclust::maxpear() %>%
.$cl %>%
mcclust::arandi(true_labels_l)
.old_cm <- .curr_cm
}
ci_out[[j]] <- ci_out_j
ci_cm[[j]] <- ci_cm_j <- .curr_cm # makePSM(ci_out_j)
pred_ci[j,] <- ci_cl_j <- mcclust::maxpear(ci_cm_j)$cl
# score_ci[j] <- mcclust::arandi(ci_cl_j, true_labels)
}
# score_ci[[l]] <- score_ci_l
}
Visualisations of results.
# Quantiles (over simulations) of the CI score for every number of chains,
# one column block per iteration count; rows are the requested quantiles.
probs <- seq(0, 1, 0.25)
# Compute all per-iteration quantile matrices first and bind once, rather
# than growing `surfaces` with cbind() inside a loop (quadratic copying).
surfaces <- do.call(
  cbind,
  lapply(seq_len(n_iter), function(i) {
    apply(score_ci[, , i], 2, quantile, probs = probs)
  })
)
# Inspect the first few columns
surfaces[, 1:4]
## [,1] [,2] [,3] [,4]
## 0% 0.02079530 0.02092104 0.01332191 0.02334382
## 25% 0.03887161 0.03685330 0.02236486 0.04215909
## 50% 0.04207585 0.04565432 0.03189869 0.04843739
## 75% 0.06738553 0.06763436 0.04727810 0.05226161
## 100% 0.09367318 0.09367318 0.07071668 0.13768466
# Reshape each quantile row of `surfaces` into an (n_iter x n_seeds) surface.
# byrow = TRUE because the columns of `surfaces` are grouped by iteration.
# z1..z5 hold the 0%, 25%, 50%, 75% and 100% quantiles respectively.
z1 <- surfaces[1, ] %>% matrix(nrow = n_iter, ncol = n_seeds, byrow = TRUE)
z2 <- surfaces[2, ] %>% matrix(nrow = n_iter, ncol = n_seeds, byrow = TRUE)
z3 <- surfaces[3, ] %>% matrix(nrow = n_iter, ncol = n_seeds, byrow = TRUE)
z4 <- surfaces[4, ] %>% matrix(nrow = n_iter, ncol = n_seeds, byrow = TRUE)
z5 <- surfaces[5, ] %>% matrix(nrow = n_iter, ncol = n_seeds, byrow = TRUE)
# Palette must be defined before plot_ly() references it; this line was
# previously commented out, leaving `col_pal` undefined in this chunk.
col_pal <- colorRampPalette(c("#146EB4", "white", "#FF9900"))(100)
# 3D surfaces of the lower quartile (z2), median (z3) and upper quartile (z4)
# of the ARI as a function of the number of chains and iterations.
plot_ly(colors = col_pal) %>%
  add_surface(x = ~1:n_seeds, y = ~1:n_iter, z = ~z2, coloraxis = 'coloraxis', opacity = 0.6) %>%
  add_surface(x = ~1:n_seeds, y = ~1:n_iter, z = ~z3, coloraxis = 'coloraxis') %>%
  add_surface(x = ~1:n_seeds, y = ~1:n_iter, z = ~z4, coloraxis = 'coloraxis', opacity = 0.6) %>%
  layout(
    scene = list(
      xaxis = list(title = "Number of chains"),
      yaxis = list(title = "Number of iterations"),
      zaxis = list(title = "ARI predicted clustering to truth")
    ),
    # Shared colour axis so all three surfaces use one scale
    coloraxis = list(colorscale = 'Viridis')
  ) %>%
  colorbar(title = "ARI")
Facet line plot
# Convert the quantile surfaces to long data frames for ggplot
plt_z2 <- surface2Df(z2, n_seeds, n_iter)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(n_seeds)` instead of `n_seeds` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
plt_z3 <- surface2Df(z3, n_seeds, n_iter)
plt_z4 <- surface2Df(z4, n_seeds, n_iter)
# Median surface (z3) with the lower (z2) and upper (z4) quartiles alongside
plt_data <- plt_z3 %>%
  dplyr::bind_cols(
    plt_z2 %>% dplyr::select(ARI),
    plt_z4 %>% dplyr::select(ARI)
  ) %>%
  set_colnames(c("N_iter", "N_chains", "ARI", "ARI_lb", "ARI_ub"))
# Subsets of chain and iteration counts to display in the facet plots
chains_used <- c(1, 5, 15, 30)
iter_used <- c(1, 10, 50, 100)
# Facet labels for the number of chains
N_chain_labs <- paste0("Number of chains ", chains_used) %>%
  set_names(chains_used)
# names(cluster_labs) <- 1:5
# ARI against number of iterations, one facet per chain count, with a
# ribbon spanning the inter-quartile range
p1 <- plt_data %>%
  filter(N_chains %in% chains_used) %>%
  ggplot(aes(x = as.numeric(N_iter), y = ARI, group = N_chains)) +
  geom_line() +
  geom_ribbon(aes(ymin = ARI_lb, ymax = ARI_ub, group = N_chains),
              colour = NA, alpha = 0.3) +
  facet_wrap(~N_chains, ncol = 1) +
  labs(x = "Number of iterations", title = "Number of chains")
# ARI against number of chains, one facet per iteration count
p2 <- plt_data %>%
  filter(N_iter %in% iter_used) %>%
  ggplot(aes(x = as.numeric(N_chains), y = ARI, group = N_iter)) +
  geom_line() +
  geom_ribbon(aes(ymin = ARI_lb, ymax = ARI_ub, group = N_iter),
              colour = NA, alpha = 0.3) +
  facet_wrap(~N_iter, ncol = 1) +
  labs(x = "Number of chains", title = "Number of iterations")
# Combine side by side (patchwork)
p1 + p2
I was thinking (for the real data) of plotting PCA “time series” (is there a nice term for a series over an ordered variable?). Example:
# Parameters defining the generated data
K <- 5   # number of clusters
n <- 100 # number of observations
p <- 20  # number of features
# Generate a dataset
my_data <- generateSimulationDataset(K = K, p = p, n = n)
# PCA of the generated data
pc1 <- prcomp(my_data$data)
# Series plot over components; include a ribbon about the inter-quartile range
p3 <- pcaSeriesPlot(pc1$x,
  labels = my_data$cluster_IDs,
  include_area = TRUE,
  n_comp = 6
)
# New facet label names for the cluster variable, derived from K rather than
# hard-coding the number of clusters
cluster_labs <- paste0("Cluster ", seq_len(K))
names(cluster_labs) <- seq_len(K)
# Create the plot
p3 +
  facet_wrap(~Cluster, labeller = labeller(Cluster = cluster_labs)) +
  labs(title = "PCA plot including medians and inter-quartile range") +
  theme(legend.position = "none")
## Warning: Removed 1400 rows containing missing values (geom_point).
## Warning: Removed 1400 row(s) containing missing values (geom_path).
## Warning: Removed 70 row(s) containing missing values (geom_path).